{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np \n",
    "import pandas as pd \n",
    "import warnings;warnings.filterwarnings('ignore')\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Embedding, LSTM,Flatten\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.utils.np_utils import to_categorical\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('train.csv',lineterminator='\\n')\n",
    "df_test = pd.read_csv('test.csv',lineterminator='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>review</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jo bhi ap se tou behtar hoon</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>ya Allah meri sister Affia ki madad farma</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Yeh khud chahta a is umar main shadi krna.  ha...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Tc ? Apky mun xe exe alfax achy nae lgty 😒💃</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Good</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>American president John f Kennedy aur in ke bh...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>Commission aur kickback ka dor Dora raha, quo...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>Allah pak nazer e bd sy bechye or humesha boha...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>Amoman log samajhte hain ke jhok siyal hi Abid...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>Akki KhanYani k tum ....... v good Wesy tum sh...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>Jail Road Pr Firing Se 1 Shaks Janbahaq Rpt M ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>hud hagai stupid actor hy.....acting ati he na...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>Haha thank you so much</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>Pakistani cricket ki tareekh ka behtreen batsm...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>Nawaz Sharif ko Pakro Harimi Police walo Tum l...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>Kash MERI MAAN ZINDA HOTI...........</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>5 Billion Dollor se zaid akhrajat ka takhmina ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>Kisi larki kay sat selfie lana tum hay maut ki...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>NAMAZ E ZUHAR Hamari Aamdani Mein Izafa Karti ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>Wasy ma sporting lahore ki team ko kr raha tha...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID                                             review  label\n",
       "0    1                       Jo bhi ap se tou behtar hoon      0\n",
       "1    2          ya Allah meri sister Affia ki madad farma      1\n",
       "2    3  Yeh khud chahta a is umar main shadi krna.  ha...      0\n",
       "3    4        Tc ? Apky mun xe exe alfax achy nae lgty 😒💃      0\n",
       "4    5                                               Good      1\n",
       "5    6  American president John f Kennedy aur in ke bh...      0\n",
       "6    7   Commission aur kickback ka dor Dora raha, quo...      0\n",
       "7    8  Allah pak nazer e bd sy bechye or humesha boha...      1\n",
       "8    9  Amoman log samajhte hain ke jhok siyal hi Abid...      1\n",
       "9   10  Akki KhanYani k tum ....... v good Wesy tum sh...      0\n",
       "10  11  Jail Road Pr Firing Se 1 Shaks Janbahaq Rpt M ...      0\n",
       "11  12  hud hagai stupid actor hy.....acting ati he na...      0\n",
       "12  13                            Haha thank you so much       1\n",
       "13  14  Pakistani cricket ki tareekh ka behtreen batsm...      1\n",
       "14  15  Nawaz Sharif ko Pakro Harimi Police walo Tum l...      0\n",
       "15  16               Kash MERI MAAN ZINDA HOTI...........      1\n",
       "16  17  5 Billion Dollor se zaid akhrajat ka takhmina ...      0\n",
       "17  18  Kisi larki kay sat selfie lana tum hay maut ki...      0\n",
       "18  19  NAMAZ E ZUHAR Hamari Aamdani Mein Izafa Karti ...      1\n",
       "19  20  Wasy ma sporting lahore ki team ko kr raha tha...      0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train['label'] = df_train['label'].map({'Negative':0,'Positive':1})\n",
    "df_train.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID        0\n",
       "review    0\n",
       "label     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#test if nan exists\n",
    "df_train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID        0\n",
       "review    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Yaqoob Memon Ki Phansi Zalimana Ghair Insani H...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Sabit qadam rehna</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Good decision on ko bhi aam shehryun ki tarah ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Jo Shakhs ALLAH Aur Qayamat Per Eman Rakhta Ho...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Tm log veriat kohli ko ahmad shahzad k sath co...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID                                             review\n",
       "0   1  Yaqoob Memon Ki Phansi Zalimana Ghair Insani H...\n",
       "1   2                                  Sabit qadam rehna\n",
       "2   3  Good decision on ko bhi aam shehryun ki tarah ...\n",
       "3   4  Jo Shakhs ALLAH Aur Qayamat Per Eman Rakhta Ho...\n",
       "4   5  Tm log veriat kohli ko ahmad shahzad k sath co..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID                                                     2712\n",
       "review    Mery liye dua kry Allah pak naseeb achy kry ameen\n",
       "Name: 2711, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test.iloc[2711]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "numpy_array = df_train.as_matrix()\n",
    "numpy_array_test = df_test.as_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#two commom ways to clean data\n",
    "def cleaner(word):\n",
    "  word = re.sub(r'\\#\\.', '', word)\n",
    "  word = re.sub(r'\\n', '', word)\n",
    "  word = re.sub(r',', '', word)\n",
    "  word = re.sub(r'\\-', ' ', word)\n",
    "  word = re.sub(r'\\.', '', word)\n",
    "  word = re.sub(r'\\\\', ' ', word)\n",
    "  word = re.sub(r'\\\\x\\.+', '', word)\n",
    "  word = re.sub(r'\\d', '', word)\n",
    "  word = re.sub(r'^_.', '', word)\n",
    "  word = re.sub(r'_', ' ', word)\n",
    "  word = re.sub(r'^ ', '', word)\n",
    "  word = re.sub(r' $', '', word)\n",
    "  word = re.sub(r'\\?', '', word)\n",
    "\n",
    "  return word.lower()\n",
    "def hashing(word):\n",
    "  word = re.sub(r'ain$', r'ein', word)\n",
    "  word = re.sub(r'ai', r'ae', word)\n",
    "  word = re.sub(r'ay$', r'e', word)\n",
    "  word = re.sub(r'ey$', r'e', word)\n",
    "  word = re.sub(r'ie$', r'y', word)\n",
    "  word = re.sub(r'^es', r'is', word)\n",
    "  word = re.sub(r'a+', r'a', word)\n",
    "  word = re.sub(r'j+', r'j', word)\n",
    "  word = re.sub(r'd+', r'd', word)\n",
    "  word = re.sub(r'u', r'o', word)\n",
    "  word = re.sub(r'o+', r'o', word)\n",
    "  word = re.sub(r'ee+', r'i', word)\n",
    "  if not re.match(r'ar', word):\n",
    "    word = re.sub(r'ar', r'r', word)\n",
    "  word = re.sub(r'iy+', r'i', word)\n",
    "  word = re.sub(r'ih+', r'eh', word)\n",
    "  word = re.sub(r's+', r's', word)\n",
    "  if re.search(r'[rst]y', 'word') and word[-1] != 'y':\n",
    "    word = re.sub(r'y', r'i', word)\n",
    "  if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):\n",
    "    word = re.sub(r'i$', r'y', word)\n",
    "  if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):\n",
    "    word = re.sub(r'h', '', word)\n",
    "  word = re.sub(r'k', r'q', word)\n",
    "  return word\n",
    "\n",
    "def array_cleaner(array):\n",
    "  # X = array\n",
    "  X = []\n",
    "  for sentence in array:\n",
    "    clean_sentence = ''\n",
    "    words = sentence.split(' ')\n",
    "    for word in words:\n",
    "      clean_sentence = clean_sentence +' '+ cleaner(word)\n",
    "    X.append(clean_sentence)\n",
    "  return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = numpy_array_test[:,1]\n",
    "X_train = numpy_array[:, 1]\n",
    "# Clean X here\n",
    "X_train = array_cleaner(X_train)\n",
    "X_test = array_cleaner(X_test)\n",
    "y_train = numpy_array[:, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6328\n",
      "2712\n",
      "6328\n"
     ]
    }
   ],
   "source": [
    "print(len(X_train))\n",
    "print(len(X_test))\n",
    "print(len(y_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, 0, 1, 0], dtype=int8)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train = np.array(y_train)\n",
    "y_train = y_train.astype('int8')\n",
    "y_train[:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_all = X_train + X_test # Combine both to fit the tokenizer.\n",
    "lentrain = len(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(\n",
    "    nb_words=2000, \n",
    "                      filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n',\n",
    "                                   lower=True,split=' ')\n",
    "tokenizer.fit_on_texts(X_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[' jo bhi ap se tou behtar hoon', ' ya allah meri sister affia ki madad farma']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_all[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[32, 11, 35, 8, 98, 284, 268], [51, 16, 87, 1, 252, 363]]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = tokenizer.texts_to_sequences(X_all)\n",
    "# X = pad_sequences(X)\n",
    "X[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,  32,  11,  35,   8,  98, 284, 268],\n",
       "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
       "          0,   0,   0,   0,   0,  51,  16,  87,   1, 252, 363]],\n",
       "      dtype=int32)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = pad_sequences(X)\n",
    "X[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(9040, 219)\n"
     ]
    }
   ],
   "source": [
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9040"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_1 (Embedding)      (None, 219, 128)          256000    \n",
      "_________________________________________________________________\n",
      "lstm_1 (LSTM)                (None, 219, 256)          394240    \n",
      "_________________________________________________________________\n",
      "flatten_1 (Flatten)          (None, 56064)             0         \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 2)                 112130    \n",
      "=================================================================\n",
      "Total params: 762,370\n",
      "Trainable params: 762,370\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "embed_dim = 128\n",
    "lstm_out = 256\n",
    "batch_size = 32\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(2000,embed_dim, input_length=X.shape[1],dropout = 0.2))\n",
    "model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2,return_sequences=True))\n",
    "model.add(Flatten())\n",
    "model.add(Dense(2,activation='softmax'))\n",
    "model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])\n",
    "# model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics = ['accuracy'])\n",
    "print(model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X[:lentrain] # Separate back into training and test sets. \n",
    "X_test = X[lentrain:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_binary = to_categorical(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "from keras.callbacks import Callback\n",
    "\n",
    "class RocAucEvaluation(Callback):\n",
    "    def __init__(self, validation_data=(), interval=1):\n",
    "        super(Callback, self).__init__()\n",
    "        self.interval = interval\n",
    "        self.x_val,self.y_val = validation_data\n",
    "    def on_epoch_end(self, epoch, log={}):\n",
    "        if epoch % self.interval == 0:\n",
    "            y_pred = self.model.predict(self.x_val, verbose=0)\n",
    "            score = roc_auc_score(self.y_val, y_pred)\n",
    "            print('\\n ROC_AUC - epoch:%d - score:%.6f \\n' % (epoch+1, score))\n",
    "#             print(y_pred[:6])\n",
    "x_train5,y_train5,x_label5,y_label5 = train_test_split(X_train,y_binary, train_size=0.8, random_state=234)\n",
    "RocAuc = RocAucEvaluation(validation_data=(y_train5,y_label5), interval=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 5062 samples, validate on 1266 samples\n",
      "Epoch 1/1\n",
      " - 140s - loss: 0.6301 - acc: 0.6213 - val_loss: 0.5292 - val_acc: 0.7425\n",
      "\n",
      " ROC_AUC - epoch:1 - score:0.813401 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "hist = model.fit(x_train5, x_label5, batch_size=batch_size, epochs=1, validation_data=(y_train5, y_label5), callbacks=[RocAuc], verbose=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_lstm = model.predict_proba(X_test,batch_size=batch_size)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2712\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0.4398137 , 0.5837693 , 0.7322361 , 0.69807845, 0.23353294,\n",
       "       0.8086587 , 0.18370664], dtype=float32)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(len(y_lstm))\n",
    "y_lstm[:7]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "lstm_output = pd.DataFrame(data={\"ID\":df_test[\"ID\"], \"Pred\":y_lstm})\n",
    "lstm_output.to_csv('lstm_new.csv', index = False, quoting = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
